Setting Up:

library(tidyverse)
library(janitor)
library(leaflet)
library(readxl)
library(easystats)

Loading the Data:

# Read the raw sightings file, standardise the column names with
# janitor::clean_names(), then print a compact column-by-column overview.
df <- janitor::clean_names(read_csv("./ufo_data.csv"))
glimpse(df)
## Rows: 80,332
## Columns: 11
## $ datetime           <chr> "10/10/1949 20:30", "10/10/1949 21:00", "10/10/1955…
## $ city               <chr> "san marcos", "lackland afb", "chester (uk/england)…
## $ state              <chr> "tx", "tx", NA, "tx", "hi", "tn", NA, "ct", "al", "…
## $ country            <chr> "us", NA, "gb", "us", "us", "us", "gb", "us", "us",…
## $ shape              <chr> "cylinder", "light", "circle", "circle", "light", "…
## $ duration_seconds   <dbl> 2700, 7200, 20, 20, 900, 300, 180, 1200, 180, 120, …
## $ duration_hours_min <chr> "45 minutes", "1-2 hrs", "20 seconds", "1/2 hour", …
## $ comments           <chr> "This event took place in early fall around 1949-50…
## $ date_posted        <chr> "4/27/2004", "12/16/2005", "1/21/2008", "1/17/2004"…
## $ latitude           <dbl> 29.88306, 29.38421, 53.20000, 28.97833, 21.41806, 3…
## $ longitude          <dbl> -97.941111, -98.581082, -2.916667, -96.645833, -157…

As we can see above, there is some cleaning to be done before we get started on analyzing any data.

Tidying the Data:

This analysis is only going to focus on UFO sightings within the United States, so I need the data to reflect that.

# Rebuild `country` from the state code alone: rows whose `state` is one of
# the 51 codes below (50 states plus DC, lower case as stored in the data)
# are labelled "us", everything else (including a missing state) "other".
# Only the "us" rows are kept.
df <- df %>%
  mutate(
    country = if_else(
      state %in% c(
        "al", "ak", "az", "ar", "ca", "co", "ct", "de", "dc", "fl",
        "ga", "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me",
        "md", "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh",
        "nj", "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri",
        "sc", "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy"
      ),
      "us",
      "other"
    )
  ) %>%
  filter(country == "us")

One would expect the above code to adequately filter submissions but when looking at latitudes and longitudes, there are still some entries that fall outside of the United States. To fix that, I’m going to create some boundaries.

# Rectangular bounding boxes used to discard rows whose coordinates fall
# outside the United States. x* values are longitudes, y* values latitudes.

# Box around the contiguous states.
continental_us <- list(
  xmin = -125.0,     # west edge
  xmax = -66.93457,  # east edge
  ymin = 24.396308,  # south edge
  ymax = 49.384358   # north edge
)

# A single box spanning both Alaska and Hawaii.
alaska_hawaii <- list(
  xmin = -178.2166,  # west edge
  xmax = -129.9943,  # east edge
  ymin = 18.9117,    # south edge
  ymax = 71.5388     # north edge
)

# Keep only rows whose coordinates lie inside one of the two bounding boxes
# (edges inclusive). Rows with a missing latitude or longitude are dropped.
continental_us_data <- df %>%
  filter(
    between(latitude, continental_us$ymin, continental_us$ymax),
    between(longitude, continental_us$xmin, continental_us$xmax)
  )

alaska_hawaii_data <- df %>%
  filter(
    between(latitude, alaska_hawaii$ymin, alaska_hawaii$ymax),
    between(longitude, alaska_hawaii$xmin, alaska_hawaii$xmax)
  )

# The two boxes do not overlap in longitude, so stacking them cannot
# duplicate a row. Contiguous-state rows come first.
df <- bind_rows(continental_us_data, alaska_hawaii_data)

Another problem with this data is the shapes of UFOs.

# List every distinct value of `shape` (including NA) before regrouping.
df %>% pull(shape) %>% unique()
##  [1] "cylinder"  "light"     "circle"    "sphere"    "disk"      "fireball" 
##  [7] "unknown"   "oval"      "other"     "rectangle" "chevron"   "formation"
## [13] "triangle"  "cigar"     NA          "delta"     "changing"  "diamond"  
## [19] "flash"     "egg"       "teardrop"  "cone"      "cross"     "pyramid"  
## [25] "round"     "crescent"  "flare"     "hexagon"   "dome"      "changed"

There are a lot of values that could mean the same thing, so we’re going to group those together.

# Collapse near-synonymous shape labels into broader families. The label
# sets are mutually exclusive, so the order of the branches does not matter.
# Labels not listed here fall through unchanged.
df <- df %>%
  mutate(
    shape = case_when(
      is.na(shape) ~ "other",
      shape %in% c("unknown", "other", "changing", "changed", "formation") ~ "other",
      shape %in% c("light", "fireball", "flash", "flare") ~ "light",
      shape %in% c("circle", "sphere", "egg", "oval", "disk", "round") ~ "spherical",
      shape %in% c("cylinder", "rectangle", "cigar") ~ "rectangular",
      shape %in% c("triangle", "pyramid") ~ "triangular",
      shape %in% c("delta", "chevron") ~ "delta",
      shape %in% c("diamond", "hexagon") ~ "diamond",
      shape %in% c("cone", "dome") ~ "cone",
      TRUE ~ as.character(shape)
    )
  )

# Confirm the reduced set of categories.
unique(df$shape)
##  [1] "rectangular" "light"       "spherical"   "other"       "delta"      
##  [6] "triangular"  "diamond"     "teardrop"    "cone"        "cross"      
## [11] "crescent"

Visualizing the Data:

First, let’s put up a map of all the UFO sightings in the United States:

Which U.S. state/territory has the most UFO sightings?

# Bar chart of the number of sightings per state, tallest bar first.
df %>%
  mutate(state = toupper(state)) %>%
  # Drop rows with a missing state. The previous `state != is.na(state)`
  # compared a string with a logical and only removed NAs as a side effect
  # of NA propagation; test for missingness directly instead.
  filter(!is.na(state)) %>%
  count(state, name = "N") %>%
  ggplot(aes(x = reorder(state, -N),
             y = N)) +
  geom_col(aes(fill = state)) +
  theme_bw() +
  # Rotate the state codes so all of them fit along the x axis.
  theme(axis.text.x = element_text(angle = 90,
                                   vjust = 0.4,
                                   hjust = 1)) +
  labs(title = "Number of UFO Sightings by US State/Territory",
       x = "State",
       y = "Number of UFO Sightings") +
  # Fill colour only distinguishes the bars, so the legend is suppressed.
  guides(fill = "none")

It makes a lot of sense that California, the most populous state, would have the most UFO sightings. Let’s see if we can find out which state has the highest percentage of UFO sightings compared to population.

# Number of sightings per state (upper-case state code, count in `N`).
state_data <- df %>%
  mutate(state = toupper(state)) %>%
  # Drop rows with a missing state. The previous `state != is.na(state)`
  # compared a string with a logical and only removed NAs as a side effect
  # of NA propagation; test for missingness directly instead.
  filter(!is.na(state)) %>%
  count(state, name = "N")

# Population lookup table: one row per state code (upper case) plus DC.
# Each code sits next to its figure so the pairing can be checked by eye.
# NOTE(review): the figures look like 2010 Census counts; confirm the source.
state_pop <- local({
  pop_by_state <- c(
    AL = 4779736, AK = 710231, AZ = 6392017, AR = 2915918, CA = 37253956,
    CO = 5029196, CT = 3574097, DE = 897934, DC = 601723, FL = 18801310,
    GA = 9687653, HI = 1360301, ID = 1567582, IL = 12830632, IN = 6483802,
    IA = 3046355, KS = 2853118, KY = 4339367, LA = 4533372, ME = 1328361,
    MD = 5773552, MA = 6547629, MI = 9883640, MN = 5303925, MS = 2967297,
    MO = 5988927, MT = 989415, NE = 1826341, NV = 2700551, NH = 1316470,
    NJ = 8791894, NM = 2059179, NY = 19378102, NC = 9535483, ND = 672591,
    OH = 11536504, OK = 3751351, OR = 3831074, PA = 12702379, RI = 1052567,
    SC = 4625364, SD = 814180, TN = 6346105, TX = 25145561, UT = 2763885,
    VT = 625741, VA = 8001024, WA = 6724540, WV = 1852994, WI = 5686986,
    WY = 563626
  )
  data.frame(state = names(pop_by_state), pop = unname(pop_by_state))
})

# Attach each state's population to its sighting count, then express the
# count as a percentage of the population (sightings / population * 100).
state_merge <- state_data %>%
  left_join(state_pop, by = "state") %>%
  mutate(percent_per_capita = N / pop * 100)

# Bar chart of population-adjusted sightings, highest state first.
ggplot(
  state_merge,
  aes(
    x = reorder(state, -percent_per_capita),
    y = percent_per_capita,
    fill = state
  )
) +
  geom_col() +
  labs(
    title = "UFO Sightings per Capita by US State/Territory",
    x = "State",
    y = "UFO Sightings per Capita"
  ) +
  theme_bw() +
  theme(
    # Rotate the state codes so all of them fit along the x axis.
    axis.text.x = element_text(angle = 90, vjust = 0.4, hjust = 1),
    plot.title = element_text(face = "bold", size = 15)
  ) +
  # Fill colour only distinguishes the bars, so the legend is suppressed.
  guides(fill = "none")

Washington, which was second highest in total number of sightings, actually has the highest percentage of sightings to population.

Now let’s figure out what the most common shape of UFO seen in the U.S. is.

# Tally sightings per shape category, most frequent first, and show the
# top entry.
most_common_shape <- count(df, shape, sort = TRUE)

head(most_common_shape, n = 1)
## # A tibble: 1 × 2
##   shape     n
##   <chr> <int>
## 1 light 21358

Not entirely surprisingly, the most common UFO seen is light, with over 21,000 sightings.

What’s the average duration of a UFO sighting?

# Mean sighting duration in seconds, ignoring missing durations.
mean(df[["duration_seconds"]], na.rm = TRUE)
## [1] 5727.521

5727.5 seconds is over 1.5 hours, which is a pretty long time to watch a UFO fly overhead.

Let’s answer a few more questions: How has the frequency of UFO sightings changed over the years?

# Line chart of the number of sightings recorded in each year.
# NOTE(review): `year` is not among the columns listed when the data is
# first loaded; this assumes it was derived from `datetime` in a step that
# is not shown here. Confirm before running this chunk on its own.
df %>%
  mutate(year = as.integer(year)) %>%
  group_by(year) %>%
  summarize(n = n()) %>%
  ggplot(aes(x = year, y = n)) +
  geom_line() +
  labs(
    title = "Frequency of UFO Sightings Over the Years",
    x = "Year",
    y = "Number of Sightings"
  ) +
  theme_minimal()

Sightings peaked around 2012, before a rapid drop in occurrence.

What season do sightings occur in most?

#' Map month numbers to meteorological seasons
#'
#' Vectorised: accepts a whole column of months at once, so it can be used
#' directly inside `mutate()` as well as element by element.
#'
#' @param month Vector of month numbers (1-12).
#' @return A character vector the same length as `month`: "Winter" for
#'   12, 1, 2; "Spring" for 3-5; "Summer" for 6-8; and "Fall" for every
#'   other value, including `NA` and anything outside 1-12.
get_season <- function(month) {
  # Start from the catch-all season and overwrite the months that belong
  # elsewhere. `%in%` never returns NA, so missing months stay "Fall".
  season <- rep("Fall", length(month))
  season[month %in% c(12, 1, 2)] <- "Winter"
  season[month %in% c(3, 4, 5)] <- "Spring"
  season[month %in% c(6, 7, 8)] <- "Summer"
  season
}

# Label every sighting with its season. vapply() guarantees a character
# vector even when there are no rows, where sapply() would return a list.
# NOTE(review): `month` is not among the columns listed when the data is
# first loaded; this assumes it was derived from `datetime` in a step that
# is not shown here. Confirm before running this chunk on its own.
df <- df %>%
  mutate(season = vapply(month, get_season, character(1)))

# Number of sightings in each season.
sightings_by_season <- df %>%
  count(season, name = "count")

# Bar chart of sightings per season. Colour only distinguishes the bars,
# so the legend is hidden.
sightings_by_season %>%
  ggplot(aes(x = season, y = count)) +
  geom_col(aes(fill = season)) +
  labs(
    title = "UFO Sightings by Season",
    x = "Season",
    y = "Number of Sightings",
    fill = "Season"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

There are more sightings in the summer than there are in any other season, presumably because people are far more comfortable being outside late at night in warmer temperatures.

We know that summertime has the most sightings, but in what month do the most UFO sightings occur?

# Number of sightings in each calendar month.
sightings_by_month <- count(df, month, name = "count")

# Bar chart of sightings per month. The fill legend is hidden.
sightings_by_month %>%
  ggplot(aes(x = month, y = count)) +
  geom_col(aes(fill = month)) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(
    title = "UFO Sightings per Month",
    x = "Month",
    y = "Number of Sightings"
  )

Summer was defined as months 6, 7, and 8, which are June, July, and August respectively. Not shockingly, those same three months have the highest numbers of sightings, with July having the most. There are a lot of reasons why this could be: more drinking, more partying, more fireworks, or just being outside more.

Let’s do something fun! I have a hypothesis that UFO sightings are more common near military bases, so we’ll plot military bases on that UFO sighting map from earlier:

# Load the military-base spreadsheet, keep the four columns of interest,
# and split the combined "lat, lon" text in `geo_point` into separate
# `latitude` / `longitude` columns (convert = TRUE lets separate() turn
# the pieces into numbers).
mil_dat <- read_xlsx("military-bases.xlsx") %>%
  janitor::clean_names() %>%
  select(geo_point, component, site_name, oper_stat) %>%
  separate(
    geo_point,
    into = c("latitude", "longitude"),
    sep = ", ",
    convert = TRUE
  )

There are a lot of UFO sightings around military bases, but there are also a lot of UFO sightings in areas with none. We can draw one of two conclusions from this:

  1. There is no visible relationship between UFO sightings and military bases.

  2. The areas with a lot of UFO sightings and no military bases actually do have military bases, but they aren’t listed.

I’ll let you decide which is more likely.

Data Modeling: